This project focuses on the 2013 BRFSS data: we analyze and visualize it and apply statistical techniques (e.g., hypothesis testing) in order to answer several questions. The Behavioral Risk Factor Surveillance System (BRFSS) is a collaborative project between all of the states in the United States (US) and participating US territories and the Centers for Disease Control and Prevention (CDC). The BRFSS is administered and supported by CDC’s Population Health Surveillance Branch, under the Division of Population Health at the National Center for Chronic Disease Prevention and Health Promotion. BRFSS is an ongoing surveillance system designed to measure behavioral risk factors for the non-institutionalized adult population (18 years of age and older) residing in the US.
A brief summary of the BRFSS data is given in the following HTML file (Note: you have to download the file in order to open it).
The data includes the following:
MAIN SURVEY: Main survey is divided into 17 Sections
Section 0 - Record Identification
Section 1 - Health Status
Section 2 - Healthy Days - Health-Related Quality of Life
Section 3 - Health Care Access
Section 4 - Inadequate Sleep
Section 5 - Hypertension Awareness
Section 6 - Cholesterol Awareness
Section 7 - Chronic Health Conditions
Section 8 - Demographics
Section 9 - Tobacco Use
Section 10 - Alcohol Consumption
Section 11 - Fruits and Vegetables
Section 12 - Exercise (Physical Activity)
Section 13 - Arthritis Burden
Section 14 - Seatbelt Use
Section 15 - Immunization
Section 16 - HIV/AIDS
OPTIONAL MODULE: Optional module contains 22 small modules. However, we will not use data from this part.
# LOAD PACKAGE AND DATA @-@
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the saved workspace; the rest of the script expects it to provide the
# `brfss2013` data frame.
load("brfss2013.RData")
# Q1: distribution of self-reported general health, split by sex.
# Attributes used:
#   genhlth: General Health
#   sex: respondents sex
str(brfss2013$genhlth)
## Factor w/ 5 levels "Excellent","Very good",..: 4 3 3 2 3 2 4 3 1 3 ...
str(brfss2013$sex)
## Factor w/ 2 levels "Male","Female": 2 2 2 2 1 2 2 2 1 2 ...

# Keep respondents of one sex who answered the general-health question.
# Selecting by level label rather than by level index keeps the filter correct
# even if the factor levels are ever reordered; rows with a missing sex are
# dropped either way because filter() discards NA conditions.
female_Q1 <- brfss2013 %>%
  filter(sex == "Female", !is.na(genhlth))
male_Q1 <- brfss2013 %>%
  filter(sex == "Male", !is.na(genhlth))

# Number of respondents per general-health category.
sum_F <- female_Q1 %>%
  count(genhlth)
sum_F
## genhlth n
## 1 Excellent 49740
## 2 Very good 93940
## 3 Good 87557
## 4 Fair 40844
## 5 Poor 17238
sum_M <- male_Q1 %>%
  count(genhlth)
sum_M
## genhlth n
## 1 Excellent 35741
## 2 Very good 65135
## 3 Good 62998
## 4 Fair 25882
## 5 Poor 10713

# Draw a pie chart from a table with columns `genhlth` and `n`
# (a stacked bar of width 1 wrapped into polar coordinates).
pie_by_genhlth <- function(counts) {
  ggplot(counts, aes(x = "", y = n, fill = genhlth)) +
    geom_bar(width = 1, stat = "identity", color = "white") +
    coord_polar("y", start = 0) +
    theme_void()
}

plot_F <- pie_by_genhlth(sum_F)
plot_M <- pie_by_genhlth(sum_M)

# Label the panels: without labels the two pies cannot be told apart.
plot_grid(plot_F, plot_M, labels = c("Female", "Male"))
# Q2: how income level and employment status are distributed across
# education levels.
# Attributes used:
#   educa: Education level
#   income2: Income level
#   employ1: Employment Status
str(brfss2013$educa)
## Factor w/ 6 levels "Never attended school or only kindergarten",..: 6 5 6 4 6 6 4 5 6 4 ...
str(brfss2013$income2)
## Factor w/ 8 levels "Less than $10,000",..: 7 8 8 7 6 8 NA 6 8 4 ...
str(brfss2013$employ1)
## Factor w/ 8 levels "Employed for wages",..: 7 1 1 7 7 1 1 7 7 5 ...

# One subset per plot: drop rows where either plotted attribute is missing.
income_Q2 <- filter(brfss2013, !(is.na(educa) | is.na(income2)))
employ_Q2 <- filter(brfss2013, !(is.na(educa) | is.na(employ1)))

# Stacked respondent counts per education level, coloured by income level.
ggplot(income_Q2, aes(educa, fill = income2)) +
  geom_bar()

# Stacked respondent counts per education level, coloured by employment status.
ggplot(employ_Q2, aes(educa, fill = employ1)) +
  geom_bar()
# Q3: hours of sleep compared between respondents with and without
# difficulty concentrating or remembering.
# Attributes used:
#   sleptim1: How much time do you sleep
#   decide: Difficulty concentrating or remembering
str(brfss2013$sleptim1)
## int [1:491775] NA 6 9 8 6 8 7 6 8 8 ...
str(brfss2013$decide)
## Factor w/ 2 levels "Yes","No": 2 2 2 2 2 2 2 2 2 2 ...

# Respondents who answered both questions.
# NOTE(review): sleptim1 is not range-checked here, while the later questions
# restrict it to at most 24 hours; confirm whether out-of-range values should
# be excluded from these statistics as well.
Q3 <- filter(brfss2013, !is.na(sleptim1), !is.na(decide))

# Location and spread of sleep time within each `decide` group.
summarise(
  group_by(Q3, decide),
  mean = mean(sleptim1),
  median = median(sleptim1),
  Var = var(sleptim1),
  IQR = IQR(sleptim1),
  n = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 2 x 6
## decide mean median Var IQR n
## <fct> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Yes 6.75 7 4.67 3 48860
## 2 No 7.09 7 1.83 2 423099

# Box plot of sleep time for each answer to the concentration question.
ggplot(Q3, aes(decide, sleptim1, color = decide)) +
  geom_boxplot() +
  theme_classic() +
  labs(
    title = "Plot of concentration difficulty and # of sleep",
    x = "have concentration difficulty",
    y = "Hour of sleep"
  )
# Q4: relation between hours of sleep and physical, mental and general health.
# Attributes used:
#   sleptim1: How much time do you sleep
#   physhlth: Number Of Days Physical Health Not Good
#   menthlth: Number Of Days Mental Health Not Good
#   genhlth: General Health
str(brfss2013$sleptim1)
## int [1:491775] NA 6 9 8 6 8 7 6 8 8 ...
str(brfss2013$physhlth)
## int [1:491775] 30 0 3 2 10 0 1 5 0 0 ...
str(brfss2013$menthlth)
## int [1:491775] 29 0 2 0 2 0 15 0 0 0 ...
str(brfss2013$genhlth)
## Factor w/ 5 levels "Excellent","Very good",..: 4 3 3 2 3 2 4 3 1 3 ...

# Complete cases with plausible values only: at most 24 hours of sleep per day
# and at most 30 "not good" days, the same 30-day bound this script applies to
# poorhlth. The bound is applied to physhlth and menthlth alike so that both
# day-count plots are built from data cleaned the same way.
health <- brfss2013 %>%
  filter(
    !is.na(sleptim1), !is.na(physhlth), !is.na(menthlth), !is.na(genhlth),
    sleptim1 <= 24, physhlth <= 30, menthlth <= 30
  )

# geom_hex() encodes the bin count with `fill`, so the legend title has to be
# set on `fill` (setting it on `colour` has no effect).

# Physical health
ggplot(health, aes(sleptim1, physhlth)) +
  geom_hex(bins = 40) +
  labs(fill = "count", x = "hours of sleep", y = "# of days physical health not good") +
  scale_fill_continuous(type = "viridis") +
  theme_bw()

# Mental health
ggplot(health, aes(sleptim1, menthlth)) +
  geom_hex(bins = 40) +
  labs(fill = "count", x = "hours of sleep", y = "# of days mental health not good") +
  scale_fill_continuous(type = "viridis") +
  theme_bw()

# General health
ggplot(health, aes(sleptim1, genhlth)) +
  geom_hex(bins = 40) +
  labs(fill = "count", x = "hours of sleep", y = "general health") +
  scale_fill_continuous(type = "viridis") +
  theme_bw()
# Q5: compare reported weight between men and women and test
# H0: mean(male weight) - mean(female weight) = 37.
# Attributes used:
#   sex: Respondents Sex
#   weight2: Reported Weight In Pounds
str(brfss2013$sex)
## Factor w/ 2 levels "Male","Female": 2 2 2 2 1 2 2 2 1 2 ...
str(brfss2013$weight2)
## Factor w/ 570 levels "",".b","100",..: 154 30 63 31 169 128 9 1 139 73 ...

# weight2 is a factor whose labels hold the reported values, together with
# non-numeric labels such as "" and ".b". Calling as.integer() directly on a
# factor returns the internal level index, not the label, so the labels are
# parsed as text instead. Labels that are not plain digits become NA.
parse_weight_lbs <- function(x) {
  txt <- trimws(as.character(x))
  is_number <- grepl("^[0-9]+$", txt)
  lbs <- rep(NA_integer_, length(txt))
  lbs[is_number] <- as.integer(txt[is_number])
  lbs
}

# Respondents with a known sex and a parsed weight below 300.
# NOTE(review): the 300 cut-off is kept from the earlier version of this
# analysis, where it was applied to level indices; confirm it is the intended
# upper bound now that it applies to real pound values.
weights_Q5 <- brfss2013 %>%
  mutate(weight_lbs = parse_weight_lbs(weight2)) %>%
  filter(!is.na(weight_lbs), !is.na(sex), weight_lbs < 300)

Male <- weights_Q5 %>%
  filter(sex == "Male")
Female <- weights_Q5 %>%
  filter(sex == "Female")
Male_weight <- Male$weight_lbs
Female_weight <- Female$weight_lbs

# Kernel density of each sample. The density objects get descriptive names so
# that the built-in `F` (alias of FALSE) is not overwritten.
dens_M <- density(Male_weight)
plot(dens_M, main = "Weight of Male")
polygon(dens_M, col = "blue4", border = "black")
dens_F <- density(Female_weight)
plot(dens_F, main = "Weight of Female")
polygon(dens_F, col = "coral", border = "black")

# Sample sizes are taken from the data instead of being typed in by hand, so
# they always match the filter above.
n <- length(Male_weight)
m <- length(Female_weight)

mean_X <- mean(Male_weight)
mean_Y <- mean(Female_weight)
V_X <- var(Male_weight)
V_Y <- var(Female_weight)

# Summary table: one column per sample (X = male, Y = female).
P <- matrix(c(mean_X, V_X, mean_Y, V_Y), 2, 2)
dimnames(P) <- list(c("mean", "sample var"), c("X", "Y"))
P

# Large-sample z statistic for the difference of two means.
# NOTE(review): the hypothesised difference of 37 was chosen when the means
# were computed from factor level indices; revisit it against the real means.
hypothesised_diff <- 37
Test_stat <- (mean_X - mean_Y - hypothesised_diff) / sqrt((V_X / n) + (V_Y / m))
Test_stat

# One-sided p-value, i.e. the upper-tail probability of |Test_stat|.
pnorm(abs(Test_stat), lower.tail = FALSE)

# The console output previously shown in this section (means of about 98 and
# 62, test statistic -2.87, p-value 0.002) came from level indices rather than
# pounds; re-run the script to obtain the corrected figures.
# Q6: days of poor health in relation to exercise, fruit intake and sleep.
# Attributes used (descriptions other than sleptim1 follow the BRFSS codebook
# -- TODO confirm):
#   exerany2: any exercise in the past 30 days (Yes/No)
#   poorhlth: number of days with poor physical or mental health
#   fruit1: coded frequency of fruit consumption
#   sleptim1: How much time do you sleep
str(brfss2013$exerany2)
## Factor w/ 2 levels "Yes","No": 2 1 2 1 2 1 1 1 1 1 ...
str(brfss2013$poorhlth)
## int [1:491775] 30 NA 0 0 0 NA 0 10 NA NA ...
str(brfss2013$fruit1)
## int [1:491775] 104 301 203 306 302 206 325 320 101 202 ...
str(brfss2013$sleptim1)
## int [1:491775] NA 6 9 8 6 8 7 6 8 8 ...

# Complete cases within the accepted ranges for each attribute.
data <- brfss2013 %>%
  filter(
    !is.na(exerany2), !is.na(poorhlth), !is.na(fruit1), !is.na(sleptim1),
    poorhlth <= 30, sleptim1 < 24, fruit1 < 400
  )

# Days of poor health for respondents who do / do not exercise.
boxplot(poorhlth ~ exerany2, data = data)

# geom_hex() encodes the bin count with `fill`, so the legend title is set on
# `fill`. Axis titles name the variable that is actually plotted on each axis.
plot1 <- ggplot(data, aes(sleptim1, poorhlth)) +
  geom_hex(bins = 40) +
  labs(fill = "count", x = "hours of sleep", y = "# of days with poor health") +
  scale_fill_continuous(type = "viridis") +
  theme_bw()

# NOTE(review): fruit1 is plotted as its raw survey code, not as a frequency
# on a common scale; confirm whether it should be converted before plotting.
plot2 <- ggplot(data, aes(fruit1, poorhlth)) +
  geom_hex(bins = 40) +
  labs(fill = "count", x = "fruit intake (fruit1 code)", y = "# of days with poor health") +
  scale_fill_continuous(type = "viridis") +
  theme_bw()
plot_grid(plot1, plot2, labels = "AUTO")

# Interactive 3-D scatter of the three numeric attributes, coloured by exercise.
plot_ly(
  data = data, z = ~poorhlth, x = ~fruit1, y = ~sleptim1,
  color = ~exerany2, colors = c('deeppink4', 'cyan1'), opacity = 0.5
) %>%
  add_markers(marker = list(size = 2))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.